
package org.apache.solr.spelling;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;


/**
 * Converts the query string to a Collection of Lucene tokens using a regular expression.
 * Boolean operators AND, OR, NOT are skipped. 
 * 
 * Each term is checked to determine if it is optional, required or prohibited.  Required
 * terms output a {@link Token} with the {@link QueryConverter#REQUIRED_TERM_FLAG} set.
 * Prohibited terms output a {@link Token} with the {@link QueryConverter#PROHIBITED_TERM_FLAG} 
 * set. If the query uses the plus (+) and minus (-) to denote required and prohibited, this
 * determination will be accurate.  In the case boolean AND/OR/NOTs are used, this
 * converter makes an uninformed guess as to whether the term would likely behave as if it
 * is Required or Prohibited and sets the flags accordingly.  These flags are used downstream
 * to generate collations for {@link WordBreakSolrSpellChecker}, in cases where an original 
 * term is split up into multiple Tokens.
 * 
 * @since solr 1.3
 **/
public class SpellingQueryConverter extends QueryConverter {

    /*
     * The following builds up a regular expression that matches productions
     * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one
     * important exception (see below).
     *
     * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference
     *
     * http://www.w3.org/TR/REC-xml/#NT-Nmtoken
     *
     * An NMTOKEN is a series of one or more NAMECHAR characters, which is an
     * extension of the NAMESTARTCHAR character class.
     *
     * The EXCEPTION referred to above concerns the colon, which is legal in an
     * NMTOKEN, but cannot currently be used as a valid field name within Solr,
     * as it is used to delimit the field name from the query string.
     */
    final static String[] NAMESTARTCHAR_PARTS = {
        "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff",
        "\\u0370-\\u037d", "\\u037f-\\u1fff",
        "\\u200c-\\u200d", "\\u2070-\\u218f",
        // The XML NameStartChar production lists #x3001-#xD7FF here. A lower bound of
        // \u2001 would swallow the three preceding ranges and admit general punctuation.
        "\\u2c00-\\u2fef", "\\u3001-\\ud7ff",
        "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd"
    };
    final static String[] ADDITIONAL_NAMECHAR_PARTS = {
        "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040"
    };
    // Two consecutive surrogate code units, i.e. one supplementary character.
    final static String SURROGATE_PAIR = "\\p{Cs}{2}";
    final static String NMTOKEN;

    static {
        StringBuilder sb = new StringBuilder();
        for (String part : NAMESTARTCHAR_PARTS) {
            sb.append(part);
        }
        for (String part : ADDITIONAL_NAMECHAR_PARTS) {
            sb.append(part);
        }
        NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+";
    }

    // A run of letters, digits, '_' or '-' that does not start at a "fieldname:" prefix
    // and does not start with a digit (negative lookahead).
    final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[\\p{L}_\\-0-9]+";
    // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");
    protected Pattern QUERY_REGEX = Pattern.compile(PATTERN);

    private static final String AND_NOT = "AND NOT";

    /**
     * Converts the original query string to a collection of Lucene Tokens.
     *
     * <p>Words are extracted with {@link #QUERY_REGEX}, always looking one word ahead so
     * that the flag for a term can depend on the boolean operator that follows it. The
     * operators AND, OR and NOT are never emitted as tokens. If at least one operator was
     * seen, every returned token additionally gets
     * {@link QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}.
     *
     * @param original the original query string, may be {@code null}
     * @return a Collection of Lucene Tokens; empty when {@code original} is {@code null}
     */
    @Override
    public Collection<Token> convert(String original) {

        if (original == null) { // this can happen with q.alt = and no query
            return Collections.emptyList();
        }

        Collection<Token> result = new ArrayList<>();
        Matcher matcher = QUERY_REGEX.matcher(original);
        String nextWord = null;
        int nextStartIndex = 0;
        String lastBooleanOp = null;
        while (nextWord != null || matcher.find()) {
            String word;
            int startIndex;
            if (nextWord != null) {
                // consume the word that was read ahead during the previous iteration
                word = nextWord;
                startIndex = nextStartIndex;
                nextWord = null;
            } else {
                word = matcher.group(0);
                startIndex = matcher.start();
            }
            // read one word ahead
            if (matcher.find()) {
                nextWord = matcher.group(0);
                nextStartIndex = matcher.start();
            }
            if (isBooleanOperator(word)) {
                lastBooleanOp = word;
                continue;
            }
            // treat "AND NOT" as "NOT"... but only when NOT is a complete word, so that
            // e.g. "AND NOTEBOOK" is still seen as a plain AND followed by a term.
            if ("AND".equals(nextWord) && isAndNotAt(original, nextStartIndex)) {
                nextWord = "NOT";
            }

            int flagValue = 0;
            if (word.charAt(0) == '-'
                    || (startIndex > 0 && original.charAt(startIndex - 1) == '-')) {
                flagValue = PROHIBITED_TERM_FLAG;
            } else if (word.charAt(0) == '+'
                    || (startIndex > 0 && original.charAt(startIndex - 1) == '+')) {
                flagValue = REQUIRED_TERM_FLAG;
            } else if (lastBooleanOp != null) {
                // the operator that follows differs from the one last seen
                if (isBooleanOperator(nextWord) && !nextWord.equals(lastBooleanOp)) {
                    flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
                }
            } else if ("NOT".equals(nextWord)) {
                // we don't know the default operator so just assume the first operator
                // isn't new... unless the 1st boolean operator is a NOT, because only
                // AND/OR can be default.
                flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
            }
            try {
                analyze(result, new StringReader(word), startIndex, flagValue);
            } catch (IOException ignored) {
                // Deliberately best effort: a word that fails analysis contributes no
                // tokens and the remaining words are still processed.
                // TODO: shouldn't we log something?
            }
        }
        if (lastBooleanOp != null) {
            for (Token t : result) {
                t.setFlags(t.getFlags() | QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG);
            }
        }
        return result;
    }

    /**
     * Runs {@code text} through the {@code analyzer} field and appends one {@link Token}
     * per emitted term to {@code result}.
     *
     * @param result        collection the produced tokens are added to
     * @param text          the text to analyze
     * @param offset        added to each token's start and end offset
     * @param flagsAttValue flags value set on every produced token
     * @throws IOException if the token stream fails
     */
    protected void analyze(Collection<Token> result, Reader text, int offset, int flagsAttValue) throws IOException {

        try (TokenStream stream = analyzer.tokenStream("", text)) {

            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
            PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
            PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                Token token = new Token();
                token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
                token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
                token.setFlags(flagsAttValue); //overwriting any flags already set...
                token.setType(typeAtt.type());
                token.setPayload(payloadAtt.getPayload());
                token.setPositionIncrement(posIncAtt.getPositionIncrement());
                result.add(token);
            }
            stream.end();
        }
    }

    /** Returns true if {@code word} is exactly one of AND, OR or NOT; false for null. */
    private static boolean isBooleanOperator(String word) {
        return "AND".equals(word) || "OR".equals(word) || "NOT".equals(word);
    }

    /**
     * Returns true if the text "AND NOT" starts at {@code index} in {@code query} and the
     * NOT is a whole word, i.e. it is followed by the end of the string or by a character
     * that cannot continue a term.
     */
    private static boolean isAndNotAt(String query, int index) {
        if (!query.startsWith(AND_NOT, index)) { // also false when too little text remains
            return false;
        }
        int end = index + AND_NOT.length();
        return end == query.length() || !isTermChar(query.charAt(end));
    }

    /**
     * Mirrors the character class at the end of {@link #PATTERN}: letters, ASCII digits,
     * underscore and hyphen. NOTE(review): a subclass that replaces QUERY_REGEX with a
     * different word definition is not reflected here.
     */
    private static boolean isTermChar(char c) {
        return Character.isLetter(c) || (c >= '0' && c <= '9') || c == '_' || c == '-';
    }
}
